{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Copyright (c) 2016 Mikel Bober-Irizar\n",
    "## Feature-Time Instability Metric\n",
    "# Script to evaluate the change in properties of features over time\n",
    "# To use, call the function find_ovalue(FEATURE, TARGET)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "########## CONFIG #########\n",
    "\n",
    "# Resolution of the time axis: number of consecutive chunks the samples are\n",
    "# split into (np.array_split); every pair of chunks is then compared\n",
    "time_res = 10\n",
    "\n",
    "# Resolution of feature value binning: number of histogram bins\n",
    "x_res = 10000\n",
    "# Higher values will find smaller fluctuations in data but may have more noise\n",
    "\n",
    "# Only used by the 'inter' method: a histogram bin is considered only if the two\n",
    "# chunks' normalised bin fractions sum to more than this value\n",
    "thresh = 0.0001\n",
    "\n",
    "# Method to measure stability, either 'inter' for histogram intersection or 'purity' for tree split purity\n",
    "method = 'purity'\n",
    "\n",
    "# Only used for the purity metric, gives weight to the different splits based on how many samples exist in the bin\n",
    "weighted = True\n",
    "# Only used for the purity metric (passed as hist_purity's ignore_nan argument):\n",
    "# False -> the purity of an empty bin (0/0) is replaced by 0 before comparing\n",
    "# True  -> empty bins stay NaN and are skipped by the final nansum\n",
    "ignore_zero = False\n",
    "\n",
    "######### END CONFIG ##########"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def hist_inter(a, b, bins, threshold=None):\n",
    "    \"\"\"Histogram-intersection similarity between two samples of one feature.\n",
    "\n",
    "    Both samples are binned over their shared value range, each histogram is\n",
    "    normalised by its sample size, and the per-bin minima are summed over\n",
    "    every bin whose combined normalised mass exceeds the threshold.\n",
    "\n",
    "    Args:\n",
    "        a, b: Non-empty sequences of numeric feature values.\n",
    "        bins: Number of histogram bins.\n",
    "        threshold: Minimum combined (a + b) normalised mass a bin needs to be\n",
    "            counted. Defaults to the module-level `thresh` setting.\n",
    "\n",
    "    Returns:\n",
    "        float: sum of the per-bin minima, at most 1.0 (identical\n",
    "        distributions) and 0.0 when the samples share no bin.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If either sample is empty.\n",
    "    \"\"\"\n",
    "    if threshold is None:\n",
    "        threshold = thresh\n",
    "\n",
    "    a = np.asarray(a)\n",
    "    b = np.asarray(b)\n",
    "    if a.size == 0 or b.size == 0:\n",
    "        raise ValueError('hist_inter requires two non-empty samples')\n",
    "\n",
    "    # Shared range so that bin i covers the same interval in both histograms\n",
    "    value_range = (min(a.min(), b.min()), max(a.max(), b.max()))\n",
    "\n",
    "    # np.histogram no longer accepts `normed` (removed in NumPy 1.24); raw counts\n",
    "    # are the default, so normalise manually by the sample size\n",
    "    hist_a = np.histogram(a, bins=bins, range=value_range)[0] / a.size\n",
    "    hist_b = np.histogram(b, bins=bins, range=value_range)[0] / b.size\n",
    "\n",
    "    # Evaluate histogram intersection over sufficiently populated bins only\n",
    "    considered = (hist_a + hist_b) > threshold\n",
    "    return float(np.minimum(hist_a, hist_b)[considered].sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def hist_purity(a, b, target_a, target_b, bins, weighted=True, ignore_nan=False):\n",
    "    \"\"\"Compare the per-bin target purity of one feature between two samples.\n",
    "\n",
    "    Both samples are binned over their shared value range. For every bin the\n",
    "    purity is (#samples with target == 1) / (#samples with target == 1 or\n",
    "    target < 1). The result aggregates the absolute purity difference\n",
    "    between the two samples over all bins.\n",
    "\n",
    "    Args:\n",
    "        a, b: Non-empty sequences of numeric feature values (lengths may differ).\n",
    "        target_a, target_b: Labels aligned with `a` and `b` respectively.\n",
    "        bins: Number of histogram bins.\n",
    "        weighted: If truthy, weight each bin by the mean of the two samples'\n",
    "            population fractions in that bin; otherwise sum the differences\n",
    "            and divide by len(a) + len(b).\n",
    "        ignore_nan: If falsy, the purity of an empty bin (0/0) is replaced by\n",
    "            0; if truthy, such bins stay NaN and are skipped by the sum.\n",
    "\n",
    "    Returns:\n",
    "        The aggregated absolute purity difference (numpy float).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If either sample is empty.\n",
    "    \"\"\"\n",
    "    a = np.asarray(a)\n",
    "    b = np.asarray(b)\n",
    "    target_a = np.asarray(target_a)\n",
    "    target_b = np.asarray(target_b)\n",
    "    if a.size == 0 or b.size == 0:\n",
    "        raise ValueError('hist_purity requires two non-empty samples')\n",
    "\n",
    "    # Shared range so that bin i covers the same interval for both samples\n",
    "    value_range = (min(a.min(), b.min()), max(a.max(), b.max()))\n",
    "\n",
    "    def _counts(values):\n",
    "        # Raw per-bin counts; `normed` was removed from np.histogram in NumPy 1.24\n",
    "        return np.histogram(values, bins=bins, range=value_range)[0]\n",
    "\n",
    "    # Boolean masks instead of a shared DataFrame: the two samples come from\n",
    "    # np.array_split and may legitimately differ in length\n",
    "    hist_a_true = _counts(a[target_a == 1])\n",
    "    hist_a_tot = hist_a_true + _counts(a[target_a < 1])\n",
    "    hist_b_true = _counts(b[target_b == 1])\n",
    "    hist_b_tot = hist_b_true + _counts(b[target_b < 1])\n",
    "\n",
    "    # Empty bins give 0/0 = NaN on purpose; silence the runtime warnings\n",
    "    with np.errstate(divide='ignore', invalid='ignore'):\n",
    "        hist_a_purity = hist_a_true / hist_a_tot\n",
    "        hist_b_purity = hist_b_true / hist_b_tot\n",
    "        # Mean of the two population fractions. The parentheses matter: without\n",
    "        # them only the second term is halved and the weights sum to 1.5\n",
    "        hist_weights = (hist_a_tot / hist_a_tot.sum() + hist_b_tot / hist_b_tot.sum()) / 2\n",
    "\n",
    "    if not ignore_nan:\n",
    "        hist_a_purity = np.nan_to_num(hist_a_purity)\n",
    "        hist_b_purity = np.nan_to_num(hist_b_purity)\n",
    "\n",
    "    purity_diff = np.abs(hist_a_purity - hist_b_purity)\n",
    "\n",
    "    if weighted:\n",
    "        return np.nansum(purity_diff * hist_weights)\n",
    "    # Explicit lengths: `len(a + b)` only means concatenation for plain lists\n",
    "    return np.nansum(purity_diff) / (len(a) + len(b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def find_ovalue_inter(feature, target):\n",
    "    \"\"\"Instability of a feature over time via histogram intersection.\n",
    "\n",
    "    The samples are separated by label (target == 1 versus everything else),\n",
    "    each group is split into `time_res` consecutive chunks, and every\n",
    "    unordered pair of chunks within a group is compared with hist_inter.\n",
    "\n",
    "    Args:\n",
    "        feature: Sequence of numeric feature values, in time order.\n",
    "        target: Sequence of labels aligned with `feature`.\n",
    "\n",
    "    Returns:\n",
    "        1 - mean histogram intersection over all compared chunk pairs, or -1\n",
    "        when no pair could be compared (same convention as\n",
    "        find_ovalue_purity).\n",
    "    \"\"\"\n",
    "    ftr_true = []\n",
    "    ftr_false = []\n",
    "\n",
    "    # Separate into positive and negative samples\n",
    "    for value, label in zip(feature, target):\n",
    "        if label == 1:\n",
    "            ftr_true.append(value)\n",
    "        else:\n",
    "            ftr_false.append(value)\n",
    "\n",
    "    cross = []\n",
    "    for samples in (ftr_true, ftr_false):\n",
    "        # Split into time bins; drop empty ones (fewer samples than time_res)\n",
    "        chunks = [c.tolist() for c in np.array_split(samples, time_res) if len(c) > 0]\n",
    "\n",
    "        # Visit each unordered pair exactly once by index. Comparing the chunk\n",
    "        # contents instead would silently drop pairs of identical chunks\n",
    "        for i in range(len(chunks)):\n",
    "            for j in range(i + 1, len(chunks)):\n",
    "                cross.append(hist_inter(chunks[i], chunks[j], x_res))\n",
    "\n",
    "    if not cross:\n",
    "        return -1\n",
    "    return 1 - (sum(cross) / len(cross))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def find_ovalue_purity(feature, target):\n",
    "    \"\"\"Instability of a feature over time via per-bin target purity.\n",
    "\n",
    "    Feature and target are split into `time_res` consecutive, aligned chunks\n",
    "    and every unordered pair of chunks is compared with hist_purity using the\n",
    "    module-level `x_res`, `weighted` and `ignore_zero` settings.\n",
    "\n",
    "    Args:\n",
    "        feature: Sequence of numeric feature values, in time order.\n",
    "        target: Sequence of labels aligned with `feature`.\n",
    "\n",
    "    Returns:\n",
    "        Mean hist_purity value over all compared chunk pairs, or -1 when no\n",
    "        pair could be compared.\n",
    "    \"\"\"\n",
    "    feature_chunks = [x.tolist() for x in np.array_split(feature, time_res)]\n",
    "    target_chunks = [x.tolist() for x in np.array_split(target, time_res)]\n",
    "\n",
    "    cross = []\n",
    "    n_chunks = len(feature_chunks)\n",
    "    # Visit each unordered pair exactly once by index. Comparing the chunk\n",
    "    # contents instead would silently drop pairs of identical chunks\n",
    "    for xi in range(n_chunks):\n",
    "        for yi in range(xi + 1, n_chunks):\n",
    "            x = feature_chunks[xi]\n",
    "            y = feature_chunks[yi]\n",
    "            if not x or not y:\n",
    "                # Fewer samples than time_res leaves empty chunks; nothing to compare\n",
    "                continue\n",
    "            dist = hist_purity(x, y, target_chunks[xi], target_chunks[yi], x_res, weighted, ignore_zero)\n",
    "            cross.append(dist)\n",
    "\n",
    "    # Explicit check instead of a bare except: only 'no pairs' maps to -1,\n",
    "    # any other problem surfaces as a real error\n",
    "    if not cross:\n",
    "        return -1\n",
    "    return sum(cross) / len(cross)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Bind find_ovalue to the implementation selected by `method` in the config cell\n",
    "if method == 'inter':\n",
    "    find_ovalue = find_ovalue_inter\n",
    "elif method == 'purity':\n",
    "    find_ovalue = find_ovalue_purity\n",
    "else:\n",
    "    # Fail fast: merely printing leaves find_ovalue undefined, so the first\n",
    "    # call later on would die with an unrelated-looking NameError\n",
    "    raise ValueError(\"Method must be set to either 'inter' or 'purity'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The star import is relied on to provide the `kg` namespace used below\n",
    "from pygoose import *\n",
    "# NOTE(review): presumably locates the project root and its data directories - confirm against pygoose\n",
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Names of the feature lists handed to project.load_feature_lists()\n",
    "feature_lists = [\n",
    "    'simple_summaries',\n",
    "    'jaccard_ngrams',\n",
    "    'fuzzy',\n",
    "    'tfidf',\n",
    "    'lda',\n",
    "    'nlp_tags',\n",
    "    'wordnet_similarity',\n",
    "    'phrase_embedding',\n",
    "    'wmd',\n",
    "    'wm_intersect',\n",
    "    \n",
    "    '3rdparty_abhishek',\n",
    "    '3rdparty_dasolmar_whq',\n",
    "    '3rdparty_mephistopheies',\n",
    "    '3rdparty_image_similarity',\n",
    "    \n",
    "    'magic_pagerank',\n",
    "    'magic_frequencies',\n",
    "    'magic_cooccurrence_matrix',\n",
    "    \n",
    "    'oofp_nn_mlp_with_magic',\n",
    "    'oofp_nn_cnn_with_magic',\n",
    "    'oofp_nn_bi_lstm_with_magic',\n",
    "    'oofp_nn_siamese_lstm_attention',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the listed features; the third returned value is discarded\n",
    "# NOTE(review): presumably returns train and test feature frames - confirm against pygoose\n",
    "df_train, df_test, _ = project.load_feature_lists(feature_lists)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Turn +/-inf into NaN, then replace every NaN with 0\n",
    "df_train = df_train.replace([np.inf, -np.inf], np.nan).fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the training target stored as y_train.pickle under the project's features directory\n",
    "# NOTE(review): the path is built by plain concatenation, so features_dir is assumed to end with a separator\n",
    "y_train = kg.io.load(project.features_dir + 'y_train.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Attach the target as a column so process() can read feature and target from the same frame\n",
    "df_train['is_duplicate'] = y_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def process(c):\n",
    "    \"\"\"Score one column of df_train, print the result and append it to the results CSV.\n",
    "\n",
    "    Args:\n",
    "        c: Name of the df_train column to evaluate against 'is_duplicate'.\n",
    "\n",
    "    Side effects:\n",
    "        Prints '<column> <score>' and appends '<column>,<score>' to a CSV in\n",
    "        the working directory whose name encodes the current config values.\n",
    "    \"\"\"\n",
    "    # find_ovalue expects plain LISTS and returns a float: the overfitting value\n",
    "    o = find_ovalue(df_train[c].tolist(), df_train['is_duplicate'].tolist())\n",
    "    print(c.ljust(60) + ' ' + str(o))\n",
    "    # Context manager so the handle is closed even if the write fails\n",
    "    with open(f'ftim-overfit-test-res-{time_res}-x-{x_res}-thresh-{thresh}-{method}.csv', 'a') as f:\n",
    "        f.write(c + ',' + str(o) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Evaluate every feature column and append its score to the results CSV\n",
    "for column in df_train.columns.tolist():\n",
    "    # The target was added to df_train as a column; scoring it against itself is meaningless\n",
    "    if column == 'is_duplicate':\n",
    "        continue\n",
    "    process(column)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
